# coding:utf-8
# 采用gensim去做word2vec的处理,使用sklearn的SVM进行分类


import warnings
warnings.filterwarnings("ignore")
import myWord2vec as WV

import sys
import jieba
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.externals import joblib
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split



def load_file_and_preprocessing():
    """Load the labelled corpora, tokenise them and split into train/test sets.

    Reads ``data/neg.xls`` and ``data/pos.xls`` without a header row, segments
    the text in column 0 of every row with ``jieba.cut``, labels positive rows
    1 and negative rows 0, and randomly holds out 20% of the samples as the
    test set. The label arrays are written to ``svm_data/y_train.npy`` and
    ``svm_data/y_test.npy``.

    Returns:
        tuple: ``(x_train, x_test)`` -- the token lists of the training and
        test samples. The matching labels are only available from the saved
        ``.npy`` files.
    """
    import os

    # `index` is not a read_excel parameter; current pandas rejects unknown
    # keyword arguments with a TypeError, so only `header=None` is passed.
    neg = pd.read_excel("data/neg.xls", header=None)
    pos = pd.read_excel("data/pos.xls", header=None)

    def tokenize(text):
        # jieba.cut yields tokens lazily; materialise them into a list.
        return list(jieba.cut(text))

    pos['words'] = pos[0].apply(tokenize)
    neg['words'] = neg[0].apply(tokenize)

    # Labels follow the concatenation order below: positives first (1),
    # then negatives (0).
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    samples = np.concatenate((pos['words'], neg['words']))

    # Random split into training and test subsets (samples and labels alike).
    x_train, x_test, y_train, y_test = train_test_split(samples, y, test_size=0.2)

    # Make sure the output directory exists so np.save cannot fail on a
    # fresh checkout.
    if not os.path.isdir('svm_data'):
        os.makedirs('svm_data')
    np.save('svm_data/y_train.npy', y_train)
    np.save('svm_data/y_test.npy', y_test)
    return x_train, x_test



def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVM, persist it and print its score on the test set.

    Args:
        train_vecs: Feature vectors used to fit the classifier.
        y_train: Labels matching ``train_vecs``.
        test_vecs: Feature vectors used for evaluation.
        y_test: Labels matching ``test_vecs``.

    The fitted model is written to ``svm_data/svm_model/model.pkl``; the
    value of ``clf.score(test_vecs, y_test)`` is printed, nothing is returned.
    """
    import os

    model_path = 'svm_data/svm_model/model.pkl'
    model_dir = os.path.dirname(model_path)
    # Create the output directory before training so a long fit is not lost
    # to a missing folder when the model is dumped.
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    clf = SVC(kernel='rbf', verbose=True)
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, model_path)
    print(clf.score(test_vecs, y_test))



def main():
    """Run the initial training pipeline end to end.

    Steps: tokenise and split the corpus, hand both splits to
    ``WV.get_train_vecs``, fetch the vectors and labels back through
    ``WV.get_data`` and train the SVM. The first run produces ``model.pkl``
    (via ``svm_train``); afterwards this does not need to be run again.
    """
    train_words, test_words = load_file_and_preprocessing()

    # Presumably builds and stores the word2vec features for both splits --
    # the implementation lives in myWord2vec and is not visible here.
    WV.get_train_vecs(train_words, test_words)

    features_train, labels_train, features_test, labels_test = WV.get_data()
    svm_train(features_train, labels_train, features_test, labels_test)


# Entry point: runs unconditionally, so importing this module also triggers
# the whole training pipeline.
# NOTE(review): there is no `if __name__ == "__main__":` guard -- confirm
# that running on import is intended.
main()







